A neuron looks something like this:
Symbolically, we can represent the key parts we want to model as:
We can model the activity in each neuron in various ways.
In order to build an artificial "brain", we need to connect many neurons together in a "neural network".
So mathematically the activation of each neuron can be represented by
$$a = \sigma(W x + b)$$
where $W$ and $b$ are the weights and bias respectively, $x$ is the input vector, and $\sigma$ is a non-linear activation function such as the sigmoid.
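As a minimal sketch of this formula (assuming a sigmoid non-linearity and made-up numbers), the activations of a small layer can be computed with plain NumPy:
In [ ]:
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

W = np.random.randn(2, 3)          # weights: 2 neurons, 3 inputs each
b = np.random.randn(2)             # one bias per neuron
x = np.array([0.5, -0.2, 0.1])     # an input vector
print(sigmoid(np.dot(W, x) + b))   # the two neurons' activations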
We will build our networks with Keras, a high-level neural network library which can use Theano or TensorFlow as backend.
Let's load some data:
In [ ]:
from __future__ import absolute_import, print_function, division
from ipywidgets import interact, interactive, widgets
import numpy as np
np.random.seed(1337) # for reproducibility
In [ ]:
from keras.datasets import mnist
(images_train, labels_train), (images_test, labels_test) = mnist.load_data()
print("Data shapes:")
print('images',images_train.shape)
print('labels', labels_train.shape)
and then visualise it:
In [ ]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
In [ ]:
def plot_mnist_digit(image, figsize=None):
    """Plot a single MNIST image."""
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(image, cmap=matplotlib.cm.binary)
    plt.xticks(np.array([]))
    plt.yticks(np.array([]))
    plt.show()
In [ ]:
def plot_1_by_2_images(image, reconstruction, figsize=None):
    """Plot an image and its reconstruction side by side."""
fig = plt.figure(figsize=figsize)
ax = fig.add_subplot(1, 2, 1)
ax.matshow(image, cmap = matplotlib.cm.binary)
plt.xticks(np.array([]))
plt.yticks(np.array([]))
ax = fig.add_subplot(1, 2, 2)
ax.matshow(reconstruction, cmap = matplotlib.cm.binary)
plt.xticks(np.array([]))
plt.yticks(np.array([]))
plt.show()
In [ ]:
def plot_10_by_10_images(images, figsize=None):
    """Plot the first 100 MNIST images in a 10 by 10 grid. Uncomment
    the cropping line below to bring the digits closer together."""
    fig = plt.figure(figsize=figsize)
    #images = [image[3:25, 3:25] for image in images]
for x in range(10):
for y in range(10):
ax = fig.add_subplot(10, 10, 10*y+x+1)
ax.matshow(images[10*y+x], cmap = matplotlib.cm.binary)
plt.xticks(np.array([]))
plt.yticks(np.array([]))
plt.show()
In [ ]:
def draw_image(i):
plot_mnist_digit(images_train[i])
print('index:', i, 'label:', labels_train[i])
interact(draw_image, i=(0, len(images_train)-1))
In [ ]:
plot_10_by_10_images(images_train, figsize=(10,10))
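Each image is a 28×28 array of bytes. For training we flatten every image into a 784-dimensional vector of floats scaled to the range $[0, 1]$: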
In [ ]:
def to_features(X):
    """Flatten 28x28 byte images into 784-dim float vectors in [0, 1]."""
    return X.reshape(-1, 784).astype("float32") / 255.0
def to_images(X):
    """Invert to_features: recover 28x28 byte images."""
    return (X*255.0).astype('uint8').reshape(-1, 28, 28)
#print((images_train[0]-(to_images(to_features(images_train[0])))).max())
print('data shape:', images_train.shape, images_train.dtype)
print('features shape', to_features(images_train).shape, to_features(images_train).dtype)
In [ ]:
# convert the train and test images into feature vectors
X_train = to_features(images_train)
X_test = to_features(images_test)
print(X_train.shape, 'training samples')
print(X_test.shape, 'test samples')
The labels are transformed to a "one-hot" encoding:
In [ ]:
# The labels need to be transformed into class indicators
from keras.utils import np_utils
y_train = np_utils.to_categorical(labels_train, nb_classes=10)
y_test = np_utils.to_categorical(labels_test, nb_classes=10)
print('labels_train:', labels_train.shape, labels_train.dtype)
print('y_train:', y_train.shape, y_train.dtype)
For example, let's inspect the first 3 labels:
In [ ]:
print('labels_train[:3]:', labels_train[:3])
print('y_train[:3]', y_train[:3])
In [ ]:
# Neural Network Architecture Parameters
nb_input = 784
nb_hidden = 512
nb_output = 10
# Training Parameters
nb_epoch = 1
batch_size = 128
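Here an epoch is one complete pass through the training set, and `batch_size` is the number of training samples used to estimate the gradient for each weight update (so one epoch over 60,000 samples takes 60000/128 ≈ 469 updates).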
First we define the "architecture" of the network:
In [ ]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation
mlp = Sequential()
mlp.add(Dense(output_dim=nb_hidden, input_dim=nb_input, init='uniform'))
mlp.add(Activation('sigmoid'))
mlp.add(Dense(output_dim=nb_output, input_dim=nb_hidden, init='uniform'))
mlp.add(Activation('softmax'))
then we compile it. This takes the symbolic computational graph of the model and compiles it into an efficient implementation which can then be used to train and evaluate the model.
Note that we have to specify which loss/objective function we want to use, as well as which optimisation algorithm to use. SGD stands for Stochastic Gradient Descent.
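As a rough sketch, each SGD step moves the weights a small step against the gradient of the loss computed on one mini-batch:
$$W \leftarrow W - \eta \frac{\partial L}{\partial W}$$
where $\eta$ is the learning rate.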
In [ ]:
mlp.compile(loss='categorical_crossentropy', optimizer='SGD', metrics=["accuracy"])
Next we train the model on our training data. Watch the loss, the objective function we are minimising, and the estimated accuracy of the model.
In [ ]:
mlp.fit(X_train, y_train,
batch_size=batch_size, nb_epoch=nb_epoch,
verbose=1)
Once the model is trained, we can evaluate its performance on the test data.
In [ ]:
mlp.evaluate(X_test, y_test)
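`evaluate` returns the loss followed by each metric we asked for at compile time; as a quick sketch:
In [ ]:
score = mlp.evaluate(X_test, y_test, verbose=0)
print('test loss:', score[0])
print('test accuracy:', score[1])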
In [ ]:
def draw_mlp_prediction(j):
plot_mnist_digit(to_images(X_test)[j])
prediction = mlp.predict_classes(X_test[j:j+1], verbose=False)[0]
print(j, ':', '\tpredict:', prediction, '\tactual:', labels_test[j])
interact(draw_mlp_prediction, j=(0, len(X_test)-1))
In [ ]:
plot_10_by_10_images(images_test, figsize=(10,10))
On the (Small) Number of Atoms in the Universe
Let's switch from Go positions to digital pictures. There is an art project to display every possible picture. Surely that would take a long time, because there must be many possible pictures. But how many? We will assume the color model known as True Color, in which each pixel can be one of 2^24 ≅ 17 million distinct colors. The digital camera shown below left has 12 million pixels, and we'll also consider much smaller pictures: the array below middle, with 300 pixels, and the array below right with just 12 pixels; shown are some of the possible pictures:
Quiz: Which of these produces a number of pictures similar to the number of atoms in the universe?
**Answer:** An array of n pixels produces (17 million)^n different pictures. (17 million)^12 ≅ 10^86, so the tiny 12-pixel array produces a million times more pictures than the number of atoms in the universe!
How about the 300 pixel array? It can produce 10^2167 pictures. You may think the number of atoms in the universe is big, but that's just peanuts to the number of pictures in a 300-pixel array. And 12M pixels? 10^86696638 pictures. Fuggedaboutit!
So the number of possible pictures is really, really, really big. And the number of atoms in the universe is looking relatively small, at least as a number of combinations.
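A quick sanity check of these exponents (assuming exactly $2^{24}$ colours per pixel):
In [ ]:
import math
# an n-pixel array gives (2**24)**n pictures; compute the base-10 exponent
for n in (12, 300, 12000000):
    print(n, 'pixels -> about 10^%d pictures' % int(n * 24 * math.log10(2)))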
In [ ]:
from keras.models import Sequential
nb_layers = 2
mlp2 = Sequential()
# add hidden layers
for i in range(nb_layers):
mlp2.add(Dense(output_dim=nb_hidden//nb_layers, input_dim=nb_input if i==0 else nb_hidden//nb_layers, init='uniform'))
mlp2.add(Activation('sigmoid'))
# add output layer
mlp2.add(Dense(output_dim=nb_output, input_dim=nb_hidden//nb_layers, init='uniform'))
mlp2.add(Activation('softmax'))
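You can inspect the resulting architecture with `summary()`:
In [ ]:
mlp2.summary()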
In [ ]:
mlp2.compile(loss='categorical_crossentropy', optimizer='SGD', metrics=["accuracy"])
In [ ]:
mlp2.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, verbose=1)
Did you notice anything about the accuracy? You can re-run the training cell above to train for a few more epochs, then evaluate on the test data.
In [ ]:
mlp2.evaluate(X_test, y_test)
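Next we build an autoencoder: a network trained to reproduce its own input. The encoder compresses the 784-dimensional input into a smaller hidden representation, and the decoder tries to reconstruct the original image from it, so below we fit the model with the images as both input and target.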
In [ ]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
mae = Sequential()
nb_layers = 1
encoder = []
decoder = []
for i in range(nb_layers):
if i>0:
encoder.append(Dropout(0.4))
encoder.append(Dense(output_dim=nb_hidden//nb_layers,
input_dim=nb_input if i==0 else nb_hidden//nb_layers,
init='glorot_uniform'))
encoder.append(Activation('sigmoid'))
# Note that these are in reverse order
decoder.append(Activation('sigmoid'))
decoder.append(Dense(output_dim=nb_input if i==0 else nb_hidden//nb_layers,
input_dim=nb_hidden//nb_layers,
init='glorot_uniform'))
#decoder.append(Dropout(0.2))
for layer in encoder:
mae.add(layer)
for layer in reversed(decoder):
mae.add(layer)
In [ ]:
from keras.optimizers import SGD
sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
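Here `lr` is the learning rate $\eta$, `decay` shrinks it slightly after every update, and `momentum` (with Nesterov's variant enabled) accumulates past gradients to smooth and speed up the descent.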
In [ ]:
mae.compile(loss='mse', optimizer='adam', metrics=["accuracy"])  # try optimizer=sgd from the cell above
In [ ]:
mae.fit(X_train, X_train, batch_size=batch_size, nb_epoch=nb_epoch, verbose=1)
In [ ]:
def draw_mae_prediction(j):
X_plot = X_test[j:j+1]
prediction = mae.predict(X_plot, verbose=False)
plot_1_by_2_images(to_images(X_plot)[0], to_images(prediction)[0])
interact(draw_mae_prediction, j=(0, len(X_test)-1))
In [ ]:
plot_10_by_10_images(images_test, figsize=(10,10))
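We can take this idea further and wrap it in a `StackedAutoencoder` class which also supports greedy layer-wise pretraining: each layer is first trained as a small standalone autoencoder on the output of the previous layer, and the learned weights are then copied into the full stacked model before fine-tuning it end to end.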
In [ ]:
from keras.models import Sequential, model_from_yaml
from keras.layers.core import Dense, Activation, Dropout
class StackedAutoencoder(object):
def __init__(self, layers, mode='autoencoder',
activation='sigmoid', init='uniform', final_activation='softmax',
dropout=0.2, optimizer='SGD', metrics=["accuracy"]):
self.layers = layers
self.mode = mode
self.activation = activation
self.final_activation = final_activation
self.init = init
self.dropout = dropout
self.optimizer = optimizer
self.metrics = metrics
self._model = None
self.build()
self.compile()
def _add_layer(self, model, i, is_encoder):
if is_encoder:
input_dim, output_dim = self.layers[i], self.layers[i+1]
activation = self.final_activation if i==len(self.layers)-2 else self.activation
else:
input_dim, output_dim = self.layers[i+1], self.layers[i]
activation = self.activation
model.add(Dense(output_dim=output_dim,
input_dim=input_dim,
init=self.init))
model.add(Activation(activation))
def build(self):
self.encoder = Sequential()
self.decoder = Sequential()
self.autoencoder = Sequential()
for i in range(len(self.layers)-1):
self._add_layer(self.encoder, i, True)
self._add_layer(self.autoencoder, i, True)
#if i<len(self.layers)-2:
# self.autoencoder.add(Dropout(self.dropout))
# Note that the decoder layers are in reverse order
for i in reversed(range(len(self.layers)-1)):
self._add_layer(self.decoder, i, False)
self._add_layer(self.autoencoder, i, False)
def compile(self):
print("Compiling the encoder ...")
self.encoder.compile(loss='categorical_crossentropy', optimizer=self.optimizer, metrics=self.metrics)
print("Compiling the decoder ...")
self.decoder.compile(loss='mse', optimizer=self.optimizer, metrics=self.metrics)
print("Compiling the autoencoder ...")
return self.autoencoder.compile(loss='mse', optimizer=self.optimizer, metrics=self.metrics)
def fit(self, X_train, Y_train, batch_size, nb_epoch, verbose=1):
result = self.autoencoder.fit(X_train, Y_train,
batch_size=batch_size, nb_epoch=nb_epoch,
verbose=verbose)
# copy the weights to the encoder
for i, l in enumerate(self.encoder.layers):
l.set_weights(self.autoencoder.layers[i].get_weights())
for i in range(len(self.decoder.layers)):
self.decoder.layers[-1-i].set_weights(self.autoencoder.layers[-1-i].get_weights())
return result
def pretrain(self, X_train, batch_size, nb_epoch, verbose=1):
for i in range(len(self.layers)-1):
# Greedily train each layer
print("Now pretraining layer {} [{}-->{}]".format(i+1, self.layers[i], self.layers[i+1]))
ae = Sequential()
self._add_layer(ae, i, True)
#ae.add(Dropout(self.dropout))
self._add_layer(ae, i, False)
ae.compile(loss='mse', optimizer=self.optimizer, metrics=self.metrics)
ae.fit(X_train, X_train, batch_size=batch_size, nb_epoch=nb_epoch, verbose=verbose)
# Then lift the training data up one layer
print("\nTransforming data from", X_train.shape, "to", (X_train.shape[0], self.layers[i+1]))
enc = Sequential()
self._add_layer(enc, i, True)
enc.compile(loss='mse', optimizer=self.optimizer, metrics=self.metrics)
enc.layers[0].set_weights(ae.layers[0].get_weights())
enc.layers[1].set_weights(ae.layers[1].get_weights())
X_train = enc.predict(X_train, verbose=verbose)
print("\nShape check:", X_train.shape)
# Then copy the learned weights
self.encoder.layers[2*i].set_weights(ae.layers[0].get_weights())
self.encoder.layers[2*i+1].set_weights(ae.layers[1].get_weights())
self.autoencoder.layers[2*i].set_weights(ae.layers[0].get_weights())
self.autoencoder.layers[2*i+1].set_weights(ae.layers[1].get_weights())
self.decoder.layers[-1-(2*i)].set_weights(ae.layers[-1].get_weights())
self.decoder.layers[-1-(2*i+1)].set_weights(ae.layers[-2].get_weights())
self.autoencoder.layers[-1-(2*i)].set_weights(ae.layers[-1].get_weights())
self.autoencoder.layers[-1-(2*i+1)].set_weights(ae.layers[-2].get_weights())
def evaluate(self, X_test, Y_test):
return self.autoencoder.evaluate(X_test, Y_test)
def predict(self, X, verbose=False):
return self.autoencoder.predict(X, verbose=verbose)
def _get_paths(self, name):
model_path = "models/{}_model.yaml".format(name)
weights_path = "models/{}_weights.hdf5".format(name)
return model_path, weights_path
    def save(self, name='autoencoder'):
        model_path, weights_path = self._get_paths(name)
        with open(model_path, 'w') as f:
            f.write(self.autoencoder.to_yaml())
        self.autoencoder.save_weights(weights_path, overwrite=True)
    def load(self, name='autoencoder'):
        model_path, weights_path = self._get_paths(name)
        with open(model_path) as f:
            self.autoencoder = model_from_yaml(f.read())
        self.autoencoder.load_weights(weights_path)
In [ ]:
sae = StackedAutoencoder(layers=[nb_input, 400, 100, 10],
activation='sigmoid',
final_activation='sigmoid',
init='uniform',
dropout=0.2,
optimizer='adam')
In [ ]:
nb_epoch = 3
In [ ]:
sae.pretrain(X_train, batch_size=batch_size, nb_epoch=nb_epoch, verbose=1)
In [ ]:
#sae.compile()
sae.fit(X_train, X_train, batch_size=batch_size, nb_epoch=nb_epoch, verbose=1)
In [ ]:
def draw_sae_prediction(j):
X_plot = X_test[j:j+1]
prediction = sae.predict(X_plot, verbose=False)
plot_1_by_2_images(to_images(X_plot)[0], to_images(prediction)[0])
print(sae.encoder.predict(X_plot, verbose=False)[0])
interact(draw_sae_prediction, j=(0, len(X_test)-1))
In [ ]:
plot_10_by_10_images(images_test, figsize=(10,10))
In [ ]:
sae.evaluate(X_test, X_test)
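Finally, we can ask what a trained network has learned by visualising what individual neurons respond to. The trick is gradient ascent in input space: start from a random noisy image and repeatedly adjust its pixels to increase the activation of the chosen neuron.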
In [ ]:
def visualise_filter(model, layer_index, filter_index):
from keras import backend as K
# build a loss function that maximizes the activation
# of the nth filter on the layer considered
    layer_output = model.layers[layer_index].output
loss = K.mean(layer_output[:, filter_index])
# compute the gradient of the input picture wrt this loss
input_img = model.layers[0].input
grads = K.gradients(loss, input_img)[0]
# normalization trick: we normalize the gradient
grads /= (K.sqrt(K.mean(K.square(grads))) + 1e-5)
# this function returns the loss and grads given the input picture
iterate = K.function([input_img], [loss, grads])
# we start from a gray image with some noise
input_img_data = np.random.random((1,nb_input,))
    # run gradient ascent for up to 100 steps
    step = 1
    for i in range(100):
loss_value, grads_value = iterate([input_img_data])
input_img_data += grads_value * step
#print("Current loss value:", loss_value)
if loss_value <= 0.:
# some filters get stuck to 0, we can skip them
break
print("Current loss value:", loss_value)
# decode the resulting input image
if loss_value>0:
#return input_img_data[0]
return input_img_data
else:
raise ValueError(loss_value)
In [ ]:
def draw_filter(i):
    """Visualise the input that most activates output neuron i of the MLP."""
    flt = visualise_filter(mlp, 3, i)
    plot_mnist_digit(to_images(flt)[0])
interact(draw_filter, i=(0, 9))